**----------------------------------------------------------------------------**
**--Local Corruption & Household Business Tax Compliance
**--Duong Le, Eddy Malesky, Anh Pham
**--Journal of Economic Behavior and Organization (July 2020)
**----------------------------------------------------------------------------**
**--Do Files 
**-1. Constructing pairwise VHBS dataset, with proximity pairing (commune pairs 1km, 2km, 3km apart)
**Step 1: merge firms from VHBS 2017 into commune-pair directory
**Step 2: merge in other variables/covariates from various other data sets

**-2. Appending "commune_data_robust.dta" with all variables for regression
**----------------------------------------------------------------------------**

clear all 
set matsize 11000 

**specify your main directory that stores all replication files
global main_directory = "insert your working directory here" 

//set data working directory
cd "$main_directory\data\sub_data" 

**----------------------------------------------------------------------------**
**-1. Constructing pairwise VHBS dataset, with proximity pairing (commune pairs 1km, 2km, 3km apart)
**Step 1: merge firms from VHBS 2017 into commune-pair directory
**----------------------------------------------------------------------------**


**merging together the "skeleton" datasets (i.e., with just commune IDs & firm IDs)
use pairwise_full_GIS_AtoA_3km_IDonly.dta, clear
sort com_pairID_new

preserve
keep if com_pairID_new==1
merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 
save firm_commune_pair_proximity_3km_IDonly.dta, replace
restore

forvalues x = 2(1)4000 { //note: 4000 are maximum iterations for a loop, so need to separate into 2 loops
	display `x'

	preserve
	keep if com_pairID_new==`x'
	
	merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 

	keep if merge==3
	drop merge
	*order com_pairID tinh huyen xa id_nganh id_firm
	*sort tinh huyen xa

	append using firm_commune_pair_proximity_3km_IDonly.dta, force
	save firm_commune_pair_proximity_3km_IDonly.dta, replace
	restore
	}


	
**---Second loop---**
use pairwise_full_GIS_AtoA_3km_IDonly.dta, clear
sort com_pairID_new

preserve
keep if com_pairID_new==4001
merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 
save firm_commune_pair_proximity_3km_IDonly_loop2.dta, replace
restore

forvalues x = 4002(1)7243 { 
	display `x'

	preserve
	keep if com_pairID_new==`x'
	
	merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 

	keep if merge==3
	drop merge

	append using _firm_commune_pair_proximity_3km_IDonly_loop2.dta, force
	save firm_commune_pair_proximity_3km_IDonly_loop2.dta, replace
	restore
	}

	
**---Merge two loop parts together---**	
use firm_commune_pair_proximity_3km_IDonly.dta, clear 
append using "firm_commune_pair_proximity_3km_IDonly_loop2.dta", force
sort com_pairID_new
drop if com_pairID_new !=.
drop merge
save "firm_commune_pair_proximity_3km_IDonly.dta", replace	

**--------------------------------------------------------------------------**
*--merge other variables into the ID skeleton--**
cd "$main_directory\data" 
use "sub_data\firm_commune_pair_proximity_3km_IDonly.dta", clear

sort com_pairID tinh huyen xa
merge m:1 com_pairID tinh huyen xa using "sub_data/pairwise_full_GIS_AtoA_3km.dta", ///
keepusing(provinceID_2017 districtID_2017 communeID_2017 source_provinceID_GIS source_districtID_GIS source_communeID_GIS lat lon area_km2 boundary_km borderID ///
pci_rank_1 pci_1 pciECindex_1 pciLAindex_1 pciTindex_1 pciTCindex_1 pciICindex_1 pciBindex_1 pciPindex_1 pciBSSidex_1 pciLTindex_1 pciLIindex_1 ///
pci_id arc_id fid_river river near_x near_y saigon bienhoa mytho dongxoai thudaumot tayninh tanan communeid name_2 name_3 name_4 elev slope precip tem suit_tea suit_rice suit_coconut suit_coffee suit_sugar x y y2 x2 xy Bdr_comm Bdr_prov Bdr_dis dist1698 seg10 seg25 seg50 seg75 seg100 treat1698 fid_prov hcmp cap hcmc) gen(merge)

keep if merge==3
drop merge

order com_pairID tinh huyen xa borderID  
sort com_pairID tinh huyen xa
save "firm_commune_pair_proximity_3km.dta", replace

*merge firm_2017 variables by id_nganh (unique variable)
merge m:1 id_nganh using "VHBS2017.dta", gen(merge)
keep if merge==3 
drop merge
save "firm_commune_pair_proximity_3km.dta", replace

**----------------------------------------------------------------------------**
**Step 2: merging in other variables/covariates from various data sets
**----------------------------------------------------------------------------**
use firm_commune_pair_proximity_3km.dta, clear

drop   provinceN_2017  districtN_2017  communeN_2017

** unit of thue11, thue12, and thue13 is in thousand dong.
label var thue11_r "total tax must pay in the first 6 months"

replace tinh=provinceID_2017 if tinh==.
replace huyen=districtID_2017 if huyen==.

** indicators for economic zone
merge m:1 communeID_2017 using "sub_data/econ_zone.dta"
keep if _merge==3
drop _merge
	
** indicators for formal firms (commune level)
merge m:1 tinh huyen xa using "sub_data/formal_firm_indicators.dta"
gen econ_formal=1 if _merge==3
label var econ_formal "commune with no missing characteristics of formal firms"
keep if _merge==1|_merge==3
drop _merge

** indicators for land characteristics(commune level)
merge m:1 provinceID_2017 districtID_2017 communeID_2017 using "sub_data/land_character.dta", gen(merge)
keep if merge==1|merge==3 
drop merge

** nightlight 2016
merge m:1 provinceID_GIS districtID_GIS communeID_GIS  using "sub_data/light2016.dta"
keep if _merge==3
drop _merge

* rescale corruption variable so that higher number means more corruption. 
replace pciICindex_1= 11-pciICindex_1
label var pciICindex_1 "corruption"

** fixed typo
rename pciBSSidex_1 pciBSSindex_1
	
**creating tax compliance ratio & winsorize variables 
winsor thue11_r, g(taxComp) p(0.01)

g taxComp_imp= taxComp
replace taxComp_imp=0 if taxComp_imp==.& phieu!=""

winsor thue11_r, g(thue11_r99) p(0.01)

foreach var in thue11  lnthudn1 worker_total {
	winsor `var', g(`var'_99) p(0.01)
}

g thue11_r99i=thue11_r99
replace thue11_r99i=0 if thue11_r99==.& phieu!=""

g thue11_99i=thue11_99
replace thue11_99i=0 if thue11_99==.& phieu!=""

foreach var in  lnrev_worker  thue11_worker thue11_worker_hr{
	winsor `var', g(`var'_99) p(0.01)
}

g thue11_worker_99i= thue11_worker_99 if phieu!=""
replace thue11_worker_99i=0 if thue11_worker_99==.& phieu!=""

g thue11_worker_hr_99i= thue11_worker_hr_99 if phieu!=""
replace thue11_worker_hr_99i=0 if thue11_worker_hr_99==.& phieu!=""

foreach var in drev1_2_r thue11_r2 ts1 {
winsor `var', g(`var'_99) p(0.01)
}

g thue11_r2_99i= thue11_r2_99
replace thue11_r2_99i=0 if thue11_r2_99==.& phieu!=""

gen corrupt_worker99 = pciICindex_1*worker_total_99
gen corrupt_operation_hr = pciICindex_1*operation_hr

label var lnthudn1_99 "log revenue"
label var worker_total_99 "\# workers"
	
label var taxComp "tax compliance ratio"
label var taxComp_imp "tax compliance ratio impute 0"	

save "firm_commune_pair_proximity_3km.dta", replace


**----------------------------------------------------------------------------**
**----------------------------------------------------------------------------**
**----------------------------------------------------------------------------**
**-2. Merge "commune_data_robust.dta" with all variables for regression
**----------------------------------------------------------------------------**
**----------------------------------------------------------------------------**
**----------------------------------------------------------------------------**
cd "$main_directory\data" 

use "commune_data_robust.dta", clear

replace tinh=provinceID_2017 if tinh==.
replace huyen=districtID_2017 if huyen==.

** merge VHBS2017_firmCount
preserve
use "sub_data/VHBS2017_IDonly.dta"
gen firmCount=1
collapse (sum) firmCount, by (tinh huyen xa)
gen firmCount_ln = ln(firmCount)
tempfile VHBS2017_firmCount
save `VHBS2017_firmCount' 
restore

merge m:1 tinh huyen xa using `VHBS2017_firmCount' 
keep if _merge==1|_merge==3
drop _merge


** indicators for economic zone
merge m:1 communeID_2017 using "sub_data/econ_zone.dta"
keep if _merge==3
drop _merge
	
** indicators for formal firms (commune level)
merge m:1 tinh huyen xa using "sub_data/formal_firm_indicators.dta"
gen econ_formal=1 if _merge==3
label var econ_formal "commune with no missing characteristics of formal firms"
keep if _merge==1|_merge==3
drop _merge

** indicators for land characteristics(commune level)
merge m:1 provinceID_2017 districtID_2017 communeID_2017 using "sub_data/land_character.dta", gen(merge)
keep if merge==1|merge==3 
drop merge

** nightlight 2016
merge m:1 provinceID_GIS districtID_GIS communeID_GIS  using "sub_data/light2016.dta"
keep if _merge==3
drop _merge

save "commune_data_robust.dta", replace





